notebook.community



In [78]:

    
import graphlab



In [79]:

    
graphlab.canvas.set_target('ipynb')



In [80]:

    
homeData= graphlab.SFrame('home_data.gl/')



In [81]:

    
homeData









    Out[81]:





    
        id
        date
        price
        bedrooms
        bathrooms
        sqft_living
        sqft_lot
        floors
        waterfront
    
    
        7129300520
        2014-10-13 00:00:00+00:00
        221900
        3
        1
        1180
        5650
        1
        0
    
    
        6414100192
        2014-12-09 00:00:00+00:00
        538000
        3
        2.25
        2570
        7242
        2
        0
    
    
        5631500400
        2015-02-25 00:00:00+00:00
        180000
        2
        1
        770
        10000
        1
        0
    
    
        2487200875
        2014-12-09 00:00:00+00:00
        604000
        4
        3
        1960
        5000
        1
        0
    
    
        1954400510
        2015-02-18 00:00:00+00:00
        510000
        3
        2
        1680
        8080
        1
        0
    
    
        7237550310
        2014-05-12 00:00:00+00:00
        1225000
        4
        4.5
        5420
        101930
        1
        0
    
    
        1321400060
        2014-06-27 00:00:00+00:00
        257500
        3
        2.25
        1715
        6819
        2
        0
    
    
        2008000270
        2015-01-15 00:00:00+00:00
        291850
        3
        1.5
        1060
        9711
        1
        0
    
    
        2414600126
        2015-04-15 00:00:00+00:00
        229500
        3
        1
        1780
        7470
        1
        0
    
    
        3793500160
        2015-03-12 00:00:00+00:00
        323000
        3
        2.5
        1890
        6560
        2
        0
    


    
        view
        condition
        grade
        sqft_above
        sqft_basement
        yr_built
        yr_renovated
        zipcode
        lat
    
    
        0
        3
        7
        1180
        0
        1955
        0
        98178
        47.51123398
    
    
        0
        3
        7
        2170
        400
        1951
        1991
        98125
        47.72102274
    
    
        0
        3
        6
        770
        0
        1933
        0
        98028
        47.73792661
    
    
        0
        5
        7
        1050
        910
        1965
        0
        98136
        47.52082
    
    
        0
        3
        8
        1680
        0
        1987
        0
        98074
        47.61681228
    
    
        0
        3
        11
        3890
        1530
        2001
        0
        98053
        47.65611835
    
    
        0
        3
        7
        1715
        0
        1995
        0
        98003
        47.30972002
    
    
        0
        3
        7
        1060
        0
        1963
        0
        98198
        47.40949984
    
    
        0
        3
        7
        1050
        730
        1960
        0
        98146
        47.51229381
    
    
        0
        3
        7
        1890
        0
        2003
        0
        98038
        47.36840673
    


    
        long
        sqft_living15
        sqft_lot15
    
    
        -122.25677536
        1340.0
        5650.0
    
    
        -122.3188624
        1690.0
        7639.0
    
    
        -122.23319601
        2720.0
        8062.0
    
    
        -122.39318505
        1360.0
        5000.0
    
    
        -122.04490059
        1800.0
        7503.0
    
    
        -122.00528655
        4760.0
        101930.0
    
    
        -122.32704857
        2238.0
        6819.0
    
    
        -122.31457273
        1650.0
        9711.0
    
    
        -122.33659507
        1780.0
        8113.0
    
    
        -122.0308176
        2390.0
        7570.0
    

[21613 rows x 21 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.

Question -1

Find the zip code with the highest average house sales price (from the earlier work sheet) and within the houses in that zip code, compute the average price



In [82]:

    
import graphlab.aggregate as agg
homeData.groupby(key_columns='zipcode',operations={'avg_sales_price' : agg.MEAN('price')})









    Out[82]:





    
        zipcode
        avg_sales_price
    
    
        98033
        803719.532407
    
    
        98032
        251296.24
    
    
        98065
        527961.203226
    
    
        98077
        682774.878788
    
    
        98144
        594547.641399
    
    
        98136
        551688.673004
    
    
        98115
        619900.5506
    
    
        98075
        790576.668524
    
    
        98034
        521652.858716
    
    
        98058
        353608.635165
    

[70 rows x 2 columns]
Note: Only the head of the SFrame is printed.
You can use print_rows(num_rows=m, num_columns=n) to print more rows and columns.



In [83]:

    
import numpy as np
np.average(homeData.filter_by(['98033'],'zipcode')['price'])









    Out[83]:





803719.53240740742



In [84]:

    
def is_valid_home(sqft):
    return (sqft >2000) & (sqft <4000)



In [85]:

    
q2homes = homeData[homeData['sqft_living'].apply(lambda x : is_valid_home(x))]



In [86]:

    
len(q2homes)









    Out[86]:





9111



In [87]:

    
len(homeData)









    Out[87]:





21613



In [88]:

    
(len(q2homes)/float(len(homeData)))*100









    Out[88]:





42.155184379771434



In [89]:

    
advanced_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode',
                     'condition', # condition of house
                     'grade', # measure of quality of construction
                     'waterfront', # waterfront property
                     'view', # type of view
                     'sqft_above', # square feet above ground
                     'sqft_basement', # square feet in basement
                     'yr_built', # the year built
                     'yr_renovated', # the year renovated
                     'lat', 'long', # the lat-long of the parcel
                     'sqft_living15', # average sq.ft. of 15 nearest neighbors 
                     'sqft_lot15' # average lot size of 15 nearest neighbors 
]

Create the test train split



In [90]:

    
train_data, test_data = homeData.random_split(0.7,seed=0)



In [91]:

    
my_features = ['bedrooms', 'bathrooms', 'sqft_living', 'sqft_lot', 'floors', 'zipcode']



In [92]:

    
my_features_model = graphlab.linear_regression.create(train_data,target = 'price',features=my_features)









    



PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 14542
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.029961     | 3727238.710196     | 1017359.987928       | 180643.719657 | 143067.739283   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+



In [93]:

    
adv_feature_model =  graphlab.linear_regression.create(train_data,target='price',features=advanced_features)









    



PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 14430
PROGRESS: Number of features          : 18
PROGRESS: Number of unpacked features : 18
PROGRESS: Number of coefficients    : 127
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.039397     | 3455720.025271     | 1406097.524156       | 152565.847787 | 137216.740082   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+



In [94]:

    
print my_features_model.evaluate(test_data)









    



{'max_error': 5280200.637821782, 'rmse': 197354.96729077716}



In [95]:

    
print adv_feature_model.evaluate(test_data)









    



{'max_error': 5097978.206195663, 'rmse': 175532.00695222418}



In [96]:

    
my_features_model.evaluate(test_data).get('rmse')- adv_feature_model.evaluate(test_data).get('rmse')









    Out[96]:





21822.960338552977



In [97]:

    
train_data_1, test_data_1 = homeData.random_split(0.8,seed=0)
my_features_model_1 = graphlab.linear_regression.create(train_data_1,target = 'price',features=my_features)
adv_feature_model_1 =  graphlab.linear_regression.create(train_data_1,target='price',features=advanced_features)









    



PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 16455
PROGRESS: Number of features          : 6
PROGRESS: Number of unpacked features : 6
PROGRESS: Number of coefficients    : 115
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.030532     | 3743119.706075     | 2061742.555508       | 182260.544700 | 176889.975691   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: Creating a validation set from 5 percent of training data. This may take a while.
          You can set ``validation_set=None`` to disable validation tracking.

PROGRESS: Linear regression:
PROGRESS: --------------------------------------------------------
PROGRESS: Number of examples          : 16480
PROGRESS: Number of features          : 18
PROGRESS: Number of unpacked features : 18
PROGRESS: Number of coefficients    : 127
PROGRESS: Starting Newton Method
PROGRESS: --------------------------------------------------------
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | Iteration | Passes   | Elapsed Time | Training-max_error | Validation-max_error | Training-rmse | Validation-rmse |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+
PROGRESS: | 1         | 2        | 0.037638     | 3474016.382253     | 1394980.351903       | 154597.533311 | 155621.449739   |
PROGRESS: +-----------+----------+--------------+--------------------+----------------------+---------------+-----------------+



In [98]:

    
print my_features_model_1.evaluate(test_data_1)
print adv_feature_model_1.evaluate(test_data_1)









    



{'max_error': 3464315.0413833335, 'rmse': 179142.60767619964}
{'max_error': 3558676.7762994887, 'rmse': 156717.7897720566}



In [99]:

    
my_features_model_1.evaluate(test_data_1).get('rmse')- adv_feature_model_1.evaluate(test_data_1).get('rmse')









    Out[99]:





22424.81790414304



In [ ]:

id	date	price	bedrooms	bathrooms	sqft_living	sqft_lot	floors
7129300520	2014-10-13 00:00:00+00:00	221900	3	1	1180	5650	1
6414100192	2014-12-09 00:00:00+00:00	538000	3	2.25	2570	7242	2
5631500400	2015-02-25 00:00:00+00:00	180000	2	1	770	10000	1
2487200875	2014-12-09 00:00:00+00:00	604000	4	3	1960	5000	1
1954400510	2015-02-18 00:00:00+00:00	510000	3	2	1680	8080	1
7237550310	2014-05-12 00:00:00+00:00	1225000	4	4.5	5420	101930	1
1321400060	2014-06-27 00:00:00+00:00	257500	3	2.25	1715	6819	2
2008000270	2015-01-15 00:00:00+00:00	291850	3	1.5	1060	9711	1
2414600126	2015-04-15 00:00:00+00:00	229500	3	1	1780	7470	1
3793500160	2015-03-12 00:00:00+00:00	323000	3	2.5	1890	6560	2

condition	grade	sqft_above	sqft_basement	yr_built	yr_renovated	zipcode	lat
3	7	1180	0	1955	0	98178	47.51123398
3	7	2170	400	1951	1991	98125	47.72102274
3	6	770	0	1933	0	98028	47.73792661
5	7	1050	910	1965	0	98136	47.52082
3	8	1680	0	1987	0	98074	47.61681228
3	11	3890	1530	2001	0	98053	47.65611835
3	7	1715	0	1995	0	98003	47.30972002
3	7	1060	0	1963	0	98198	47.40949984
3	7	1050	730	1960	0	98146	47.51229381
3	7	1890	0	2003	0	98038	47.36840673

long	sqft_living15	sqft_lot15
-122.25677536	1340.0	5650.0
-122.3188624	1690.0	7639.0
-122.23319601	2720.0	8062.0
-122.39318505	1360.0	5000.0
-122.04490059	1800.0	7503.0
-122.00528655	4760.0	101930.0
-122.32704857	2238.0	6819.0
-122.31457273	1650.0	9711.0
-122.33659507	1780.0	8113.0
-122.0308176	2390.0	7570.0

zipcode	avg_sales_price
98033	803719.532407
98032	251296.24
98065	527961.203226
98077	682774.878788
98144	594547.641399
98136	551688.673004
98115	619900.5506
98075	790576.668524
98034	521652.858716
98058	353608.635165